Load in packages
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.4 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.1 ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(tidyverse)
library(ggplot2)
Import data:
Find missing data:
No missing data. Check data types of each variable:
str(house)
'data.frame': 21613 obs. of 22 variables:
$ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
$ date : chr "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
$ price : num 221900 538000 180000 604000 510000 ...
$ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
$ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
$ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
$ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
$ floors : num 1 2 1 1 1 1 2 1 1 2 ...
$ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
$ view : int 0 0 0 0 0 0 0 0 0 0 ...
$ condition : int 3 3 3 5 3 3 3 3 3 3 ...
$ grade : int 7 7 6 7 8 11 7 7 7 7 ...
$ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
$ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
$ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
$ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
$ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
$ lat : num 47.5 47.7 47.7 47.5 47.6 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
$ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
$ num : int 1 1 1 1 1 1 1 1 1 1 ...
We will definitely need to change the data type of the date column, and we should consider converting the ordinal variables (such as view, condition, and grade) into factors.
Convert date variable to date type:
Turning view, condition, and grade into ordered factors:
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
names(house)
[1] "id" "date" "price" "bedrooms" "bathrooms" "sqft_living" "sqft_lot"
[8] "floors" "waterfront" "view" "condition" "grade" "sqft_above" "sqft_basement"
[15] "yr_built" "yr_renovated" "zipcode" "lat" "long" "sqft_living15" "sqft_lot15"
[22] "num"
##produce the 4 scatter plots in a 2 by 2 matrix
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
##produce the 4 scatter plots in a 2 by 2 matrix
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
colSums(house[,quant_vars] == 0)
yr_built yr_renovated floors bedrooms bathrooms sqft_living sqft_lot sqft_above
0 20699 0 13 10 0 0 0
sqft_basement sqft_living15 sqft_lot15
13126 0 0
colSums(house[,quant_vars] == 0)
yr_built yr_renovated floors bedrooms bathrooms sqft_living sqft_lot sqft_above
0 20683 0 0 0 0 0 0
sqft_basement sqft_living15 sqft_lot15
13110 0 0
grid.arrange(sp_floors, bp_floors, ncol = 2, nrow = 1)
`geom_smooth()` using formula 'y ~ x'
names(house)
[1] "price" "bedrooms" "bathrooms" "sqft_living" "sqft_lot" "floors" "waterfront"
[8] "view" "condition" "grade" "sqft_above" "sqft_basement" "sqft_living15" "sqft_lot15"
[15] "age"
hist.data.frame(house[,quant_vars])
click left mouse button to proceed
ggcorrplot(corr,
method = "circle",
lab = TRUE,
# type = "lower",
outline.color = "white",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
summary(fit)
Call:
lm(formula = log(price) ~ waterfront * sqft_living + ., data = house)
Residuals:
Min 1Q Median 3Q Max
-2.60701 -0.23786 0.01237 0.23455 1.77491
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.199e+01 3.322e-02 360.872 < 2e-16 ***
waterfront.L 3.008e-01 4.295e-02 7.003 2.58e-12 ***
sqft_living 2.793e-04 1.049e-05 26.626 < 2e-16 ***
bedrooms -4.121e-02 3.220e-03 -12.798 < 2e-16 ***
bathrooms 1.083e-01 5.542e-03 19.535 < 2e-16 ***
sqft_lot 1.918e-07 8.132e-08 2.358 0.0184 *
floors2 7.979e-02 7.277e-03 10.964 < 2e-16 ***
floors3 3.601e-01 1.566e-02 22.990 < 2e-16 ***
view1 1.477e-01 8.750e-03 16.878 < 2e-16 ***
condition1 3.170e-02 5.500e-03 5.763 8.36e-09 ***
grade1 8.805e-02 5.368e-03 16.404 < 2e-16 ***
sqft_above -3.195e-05 7.210e-06 -4.431 9.42e-06 ***
sqft_basement NA NA NA NA
sqft_living15 1.753e-04 5.535e-06 31.680 < 2e-16 ***
sqft_lot15 -8.582e-07 1.243e-07 -6.903 5.21e-12 ***
age 4.169e-03 1.140e-04 36.582 < 2e-16 ***
waterfront.L:sqft_living -3.660e-07 1.207e-05 -0.030 0.9758
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3426 on 21581 degrees of freedom
Multiple R-squared: 0.5769, Adjusted R-squared: 0.5766
F-statistic: 1962 on 15 and 21581 DF, p-value: < 2.2e-16
summary(fit_les.sqft_basement)
Call:
lm(formula = log(price) ~ waterfront * sqft_living + . - sqft_basement,
data = house)
Residuals:
Min 1Q Median 3Q Max
-2.60701 -0.23786 0.01237 0.23455 1.77491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.199e+01 3.322e-02 360.872 < 2e-16 ***
waterfront.L 3.008e-01 4.295e-02 7.003 2.58e-12 ***
sqft_living 2.793e-04 1.049e-05 26.626 < 2e-16 ***
bedrooms -4.121e-02 3.220e-03 -12.798 < 2e-16 ***
bathrooms 1.083e-01 5.542e-03 19.535 < 2e-16 ***
sqft_lot 1.918e-07 8.132e-08 2.358 0.0184 *
floors2 7.979e-02 7.277e-03 10.964 < 2e-16 ***
floors3 3.601e-01 1.566e-02 22.990 < 2e-16 ***
view1 1.477e-01 8.750e-03 16.878 < 2e-16 ***
condition1 3.170e-02 5.500e-03 5.763 8.36e-09 ***
grade1 8.805e-02 5.368e-03 16.404 < 2e-16 ***
sqft_above -3.195e-05 7.210e-06 -4.431 9.42e-06 ***
sqft_living15 1.753e-04 5.535e-06 31.680 < 2e-16 ***
sqft_lot15 -8.582e-07 1.243e-07 -6.903 5.21e-12 ***
age 4.169e-03 1.140e-04 36.582 < 2e-16 ***
waterfront.L:sqft_living -3.660e-07 1.207e-05 -0.030 0.9758
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3426 on 21581 degrees of freedom
Multiple R-squared: 0.5769, Adjusted R-squared: 0.5766
F-statistic: 1962 on 15 and 21581 DF, p-value: < 2.2e-16
str(house)
'data.frame': 21597 obs. of 15 variables:
$ price : num 221900 538000 180000 604000 510000 ...
$ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
$ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
$ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
$ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
$ floors : Factor w/ 3 levels "1","2","3": 1 2 1 1 1 1 2 1 1 2 ...
$ waterfront : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
$ view : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ condition : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
$ grade : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 1 1 ...
$ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
$ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
$ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
$ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
$ age : num 66 30 88 56 34 20 26 58 61 18 ...